// Copyright (C) 2003
// Gerhard Neumann (gneumann@gmx.net)
// Stephan Neumann (sneumann@gmx.net) 
//                
// This file is part of RL Toolbox.
// http://www.igi.tugraz.at/ril_toolbox
//
// All rights reserved.
// 
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
// 3. The name of the author may not be used to endorse or promote products
//    derived from this software without specific prior written permission.
// 
// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
// IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
// NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// USE OF THE EXAMPLE
// cartpoleQRBFLearning [-d debugfile]
// 
// This example shows how to learn the CartPole Swing Up Task with Q-Function Learning using RBFs

#include <time.h>
#include <stdio.h>
#include <string.h>

#include "ccartpolemodel.h"
#include "cpolicies.h"
#include "cagent.h"
#include "ril_debug.h"
#include "ctdlearner.h"
#include "clinearfafeaturecalculator.h"
#include "cvfunctionlearner.h"
#include "crewardmodel.h"
#include "ctorchvfunction.h"
#include "canalyzer.h"
#include "cmontecarlo.h"

#include <math.h>


// This is the entry point for this application
int main( int argc, char **argv )
{
	// Initialize the random generator 
	srand(1000);

	printf ("-=<   Reinforcement Learning Benchmark - Cartpole Swing Up with V-Function Learning using RBFs >=-\n\n");

	int arg = 1;

	// Console Input Processing
	while (arg < argc - 1)
	{
		if (strcmp(argv[arg], "-d") == 0)
		{
			// The "-d" option enables debugging
			arg ++;
			char *debugFile = argv[arg];
			DebugInit(debugFile, "+", false);
		}
		arg ++;
	}

	// Create our CartPole model with the time step 0.05. All other physical parameters keep their default values (see the documentation or the source).
	CCartPoleModel *cartpoleModel = new CCartPoleModel(0.05);

	// Use 100 simulation steps of the model per time step. This helps to overcome numerical inaccuracies.
	cartpoleModel->setSimulationSteps(100);
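	// (Assuming the transition function subdivides the sampling interval, each internal integration step then covers 0.05 s / 100 = 0.0005 s.)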
	// Use random reset states. For the cartpole model only the pole angle is chosen randomly; the velocity is always 0 at the beginning.
	cartpoleModel->setResetType(DM_RESET_TYPE_RANDOM);


	// Create the environment for the agent; the environment stores the current state of the agent.
	CTransitionFunctionEnvironment *environmentModel = new CTransitionFunctionEnvironment(cartpoleModel);


	// Create our reward function for the cartpole (the reward depends on the height of the pole)
	CRewardFunction *rewardFunction = new CCartPoleRewardFunction(cartpoleModel);

	// Create the agent in our environmentModel.
	CAgent *agent = new CAgent(environmentModel);
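	// The agent drives the interaction loop: it asks its controller for an action, executes it in the environment
	// and notifies all registered SemiMDP listeners (such as the learner added below) of every step.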

	// Now we can already create our RBF network.
	// For this we use a CRBFFeatureCalculator. Our feature calculator uses all four dimensions of the model state
	// and lays a 7x7x15x15 RBF grid over the state space (7 centers along the first two dimensions, 15 along the last two).
	// For each dimension the given sigmas are used.
	// For the calculation of useful sigmas we have to consider that the CRBFFeatureCalculator always uses the
	// normalized state representation, so the state variables are scaled to the interval [0,1].
	unsigned int dimensions[] = {0, 1, 2, 3};
	unsigned int partitions[] = {7, 7, 15, 15};
	double offsets[] = {0.0, 0.0, 0.0, 0.0};
	double sigma[] = {0.07, 0.07, 0.03, 0.03};
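	// As a rough sketch of what the feature calculator computes (the exact normalization is a toolbox detail):
	// each RBF center c_i gets the Gaussian activation
	//   phi_i(s) = exp( - sum_d (s_d - c_{i,d})^2 / (2 * sigma_d^2) )
	// on the normalized state s; typically only the strongest activations are kept as active features and normalized to sum to 1.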

	// Now we can create our Feature Calculator
	CFeatureCalculator *rbfCalc = new CRBFFeatureCalculator(4, dimensions, partitions, offsets, sigma);

	// Of course we have to add the feature calculator to the agent's list of state modifiers.
	agent->addStateModifier(rbfCalc);
	

	// Now we can create the possible actions; we give the agent 3 actions to choose from: -uMax, 0 and +uMax.
	// Since we deal with continuous actions we have to define the action values ourselves.
	double action1[] = {-cartpoleModel->uMax};
	double action2[] = {+cartpoleModel->uMax};
	double action3[] = {0};

	CStaticContinuousAction *minContAction = new CStaticContinuousAction(cartpoleModel->getContinuousAction(), action1);
	CStaticContinuousAction *maxContAction = new CStaticContinuousAction(cartpoleModel->getContinuousAction(), action2);
	CStaticContinuousAction *nullContAction = new CStaticContinuousAction(cartpoleModel->getContinuousAction(), action3);

	// Add all actions to the agent
	agent->addAction(minContAction);
	agent->addAction(maxContAction);
	agent->addAction(nullContAction);

	// Disable Episode logging
	agent->setLogEpisode(false);


	// Create our feature Q-Function using our RBF net as feature calculator and the created actions.
	CFeatureQFunction *qFunction = new CFeatureQFunction(agent->getActions(), rbfCalc);
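	// In essence the feature Q-Function is linear in the features: Q(s, a) = sum_i w_{a,i} * phi_i(s),
	// i.e. one weight per RBF feature and action, where phi(s) are the RBF activations of the current state.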

	// Create the Q-Function learner; we use a standard Q-Learner here.
	// The Q-Function learner needs the reward function and the Q-Function objects. Since it is a Q-Learner it uses
	// a greedy policy as its estimation policy.
	CQLearner *learner = new CQLearner(rewardFunction, qFunction);
	// Add the learner to the agent listeners list
	agent->addSemiMDPListener(learner);
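	// For reference, standard Q-Learning (which CQLearner implements, up to toolbox details) performs the update
	//   Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
	// after every step, where alpha is the "QLearningRate" parameter set below and gamma the discount factor.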

	// Create the Controller for the agent from the QFunction.
	// We will use a SoftMax-Policy for exploration.
	CAgentController *qFunctionPolicy = new CQStochasticPolicy(agent->getActions(), new CSoftMaxDistribution(5.0), qFunction);
	// Set the policy
	agent->setController(qFunctionPolicy);
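	// A soft-max (Boltzmann) policy selects action a in state s with probability proportional to exp(beta * Q(s, a));
	// the constructor argument (here 5.0) is presumably this beta: larger values make the policy act more greedily.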

	// Set the learning rate parameter
	learner->setParameter("QLearningRate", 0.25);

	// To trace the learning process we will use a policy evaluator. Here we will use an average-reward policy evaluator,
	// which calculates the average reward over 10 episodes, each lasting 10 seconds. In order not to disturb the learning process, we have to
	// disable learning before each policy evaluation.

	CPolicyEvaluator *policyEvaluator = new CAverageRewardCalculator(agent, rewardFunction, 10, my_round(10.0 / cartpoleModel->getTimeIntervall()));
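	// With a time step of 0.05 s, 10 seconds correspond to my_round(10.0 / 0.05) = 200 evaluation steps per episode.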
	// Start Learning now, learn 5000 Episodes

	for (int i = 0; i < 5000; i++)
	{
		// Start a new episode; the pole angle will be chosen randomly and the velocity is set to 0.
		agent->startNewEpisode();
		// Learn 1 episode with at most 200 steps
		agent->doControllerEpisode(1, 200);


		// Disable Learning now for policy evaluation
		agent->removeSemiMDPListener(learner);
		printf("Episode %d (Steps %d): Average Reward: %f, Failed %d\n", i, i * 200, policyEvaluator->evaluatePolicy(), environmentModel->isFailed());
		// Enable Learning again
		agent->addSemiMDPListener(learner);

		printf("\n");
	}

	// Save the learned data

	FILE *results = fopen("QFunctionCartPole.table", "w");

	
	qFunction->saveData(results);

	fclose(results);

	printf("Finished Learning\n");
	printf("\n\n<< Press Enter >>\n");
	getchar();

	// Cleaning up
	delete agent;
	delete environmentModel;
	delete cartpoleModel;
	delete qFunction;
	delete learner;
	delete qFunctionPolicy;
}
